Import des librairies nécessaires¶

In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

Pré-traitement des données¶

In [ ]:
custom_palette = sns.color_palette(["#2E8B57", "#e76f51", "#606c38"])
In [ ]:
sns.set_palette(custom_palette)
sns.set_style("white")
In [ ]:
colormap = {1.0: "purple", 2.0: "orange", 3.0: "green"}
In [ ]:
sns.set_palette(sns.color_palette("pastel"))

Ouverture du jeu de données et informations générales¶

In [ ]:
data = pd.read_csv("fetal_health.csv")
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability                    2126 non-null   float64
 9   percentage_of_time_with_abnormal_long_term_variability  2126 non-null   float64
 10  mean_value_of_long_term_variability                     2126 non-null   float64
 11  histogram_width                                         2126 non-null   float64
 12  histogram_min                                           2126 non-null   float64
 13  histogram_max                                           2126 non-null   float64
 14  histogram_number_of_peaks                               2126 non-null   float64
 15  histogram_number_of_zeroes                              2126 non-null   float64
 16  histogram_mode                                          2126 non-null   float64
 17  histogram_mean                                          2126 non-null   float64
 18  histogram_median                                        2126 non-null   float64
 19  histogram_variance                                      2126 non-null   float64
 20  histogram_tendency                                      2126 non-null   float64
 21  fetal_health                                            2126 non-null   float64
dtypes: float64(22)
memory usage: 365.5 KB
None
In [ ]:
data_with_fetal_health = data.copy()
In [ ]:
data_descriptions = {
    'baseline_value': 'Fréquence cardiaque fœtale de base (FCF)',
    'accelerations': 'Nombre d\'accélérations par seconde',
    'fetal_movement': 'Nombre de mouvements fœtaux par seconde',
    'uterine_contractions': 'Nombre de contractions utérines par seconde',
    'light_decelerations': 'Nombre de décélérations légères par seconde',
    'severe_decelerations': 'Nombre de décélérations sévères par seconde',
    'prolongued_decelerations': 'Nombre de décélérations prolongées par seconde',
    'abnormal_short_term_variability': 'Pourcentage de temps avec une variabilité à court terme anormale',
    'mean_value_of_short_term_variability': 'Valeur moyenne de la variabilité à court terme',
    'percentage_of_time_with_abnormal_long_term_variability': 'Pourcentage de temps avec une variabilité à long terme anormale',
    'mean_value_of_long_term_variability': 'Valeur moyenne de la variabilité à long terme',
    'histogram_width': 'Largeur de l\'histogramme utilisant toutes les valeurs d\'un enregistrement',
    'histogram_min': 'Valeur minimale de l\'histogramme',
    'histogram_max': 'Valeur maximale de l\'histogramme',
    'histogram_number_of_peaks': 'Nombre de pics dans l\'histogramme de l\'examen',
    'histogram_number_of_zeroes': 'Nombre de zéros dans l\'histogramme de l\'examen',
    'histogram_mode': 'Mode de l\'histogramme',
    'histogram_mean': 'Moyenne de l\'histogramme',
    'histogram_median': 'Médiane de l\'histogramme',
    'histogram_variance': 'Variance de l\'histogramme',
    'histogram_tendency': 'Tendance de l\'histogramme',
    'fetal_health': 'Santé fœtale: 1 - Normal 2 - Suspect 3 - Pathologique'
}

df_descriptions = pd.DataFrame.from_dict(data_descriptions, orient='index', columns=['Description'])
print(df_descriptions)
                                                                                          Description
baseline_value                                               Fréquence cardiaque fœtale de base (FCF)
accelerations                                                      Nombre d'accélérations par seconde
fetal_movement                                                Nombre de mouvements fœtaux par seconde
uterine_contractions                                      Nombre de contractions utérines par seconde
light_decelerations                                       Nombre de décélérations légères par seconde
severe_decelerations                                      Nombre de décélérations sévères par seconde
prolongued_decelerations                               Nombre de décélérations prolongées par seconde
abnormal_short_term_variability                     Pourcentage de temps avec une variabilité à co...
mean_value_of_short_term_variability                   Valeur moyenne de la variabilité à court terme
percentage_of_time_with_abnormal_long_term_vari...  Pourcentage de temps avec une variabilité à lo...
mean_value_of_long_term_variability                     Valeur moyenne de la variabilité à long terme
histogram_width                                     Largeur de l'histogramme utilisant toutes les ...
histogram_min                                                        Valeur minimale de l'histogramme
histogram_max                                                        Valeur maximale de l'histogramme
histogram_number_of_peaks                               Nombre de pics dans l'histogramme de l'examen
histogram_number_of_zeroes                             Nombre de zéros dans l'histogramme de l'examen
histogram_mode                                                                  Mode de l'histogramme
histogram_mean                                                               Moyenne de l'histogramme
histogram_median                                                             Médiane de l'histogramme
histogram_variance                                                          Variance de l'histogramme
histogram_tendency                                                          Tendance de l'histogramme
fetal_health                                        Santé fœtale: 1 - Normal 2 - Suspect 3 - Patho...

Ici l'on peut voir que pandas interpréte correctement les différents types de données, sauf pour fetal_health qui dans notre cas devrait être catégorielle et ordonnée (3 est plus grave que 2 qui est plus grave que 1)

Conversion de la variable fetal_health en variable qualitative ordinale¶

In [ ]:
data.fetal_health = pd.Categorical(data.fetal_health, categories=[1.0, 2.0, 3.0], ordered=True)
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype   
---  ------                                                  --------------  -----   
 0   baseline value                                          2126 non-null   float64 
 1   accelerations                                           2126 non-null   float64 
 2   fetal_movement                                          2126 non-null   float64 
 3   uterine_contractions                                    2126 non-null   float64 
 4   light_decelerations                                     2126 non-null   float64 
 5   severe_decelerations                                    2126 non-null   float64 
 6   prolongued_decelerations                                2126 non-null   float64 
 7   abnormal_short_term_variability                         2126 non-null   float64 
 8   mean_value_of_short_term_variability                    2126 non-null   float64 
 9   percentage_of_time_with_abnormal_long_term_variability  2126 non-null   float64 
 10  mean_value_of_long_term_variability                     2126 non-null   float64 
 11  histogram_width                                         2126 non-null   float64 
 12  histogram_min                                           2126 non-null   float64 
 13  histogram_max                                           2126 non-null   float64 
 14  histogram_number_of_peaks                               2126 non-null   float64 
 15  histogram_number_of_zeroes                              2126 non-null   float64 
 16  histogram_mode                                          2126 non-null   float64 
 17  histogram_mean                                          2126 non-null   float64 
 18  histogram_median                                        2126 non-null   float64 
 19  histogram_variance                                      2126 non-null   float64 
 20  histogram_tendency                                      2126 non-null   float64 
 21  fetal_health                                            2126 non-null   category
dtypes: category(1), float64(21)
memory usage: 351.1 KB
None

Affichage du nombre de valeurs différentes et du nombre de valeurs manquantes¶

In [ ]:
print(f"Nombre de valeurs différentes par variables :\n{data.nunique()}")
print()
print(f"Nombre de valeurs manquantes par variables :\n{data.isna().sum()}")
Nombre de valeurs différentes par variables :
baseline value                                             48
accelerations                                              20
fetal_movement                                            102
uterine_contractions                                       16
light_decelerations                                        16
severe_decelerations                                        2
prolongued_decelerations                                    6
abnormal_short_term_variability                            75
mean_value_of_short_term_variability                       57
percentage_of_time_with_abnormal_long_term_variability     87
mean_value_of_long_term_variability                       249
histogram_width                                           154
histogram_min                                             109
histogram_max                                              86
histogram_number_of_peaks                                  18
histogram_number_of_zeroes                                  9
histogram_mode                                             88
histogram_mean                                            103
histogram_median                                           95
histogram_variance                                        133
histogram_tendency                                          3
fetal_health                                                3
dtype: int64

Nombre de valeurs manquantes par variables :
baseline value                                            0
accelerations                                             0
fetal_movement                                            0
uterine_contractions                                      0
light_decelerations                                       0
severe_decelerations                                      0
prolongued_decelerations                                  0
abnormal_short_term_variability                           0
mean_value_of_short_term_variability                      0
percentage_of_time_with_abnormal_long_term_variability    0
mean_value_of_long_term_variability                       0
histogram_width                                           0
histogram_min                                             0
histogram_max                                             0
histogram_number_of_peaks                                 0
histogram_number_of_zeroes                                0
histogram_mode                                            0
histogram_mean                                            0
histogram_median                                          0
histogram_variance                                        0
histogram_tendency                                        0
fetal_health                                              0
dtype: int64

Ici on voit qu'aucune variable n'est constante (aucune variable n'a qu'une seule et unique valeur), et l'on remarque aussi que le jeu de données ne contient aucune valeurs vides, ce qui nous arrange pour la suite.

Exploration et visualisation des données¶

Effectifs de chaque classe¶

In [ ]:
fig, axes = plt.subplots(1, 1, figsize=(6, 4))

sns.countplot(x='fetal_health', hue="fetal_health", data=data, ax=axes)
axes.set_title('Effectifs de fetal_health')
axes.set_xlabel('Classe de fetal_health')
axes.set_ylabel('Effectif')


# plt.save_fig('repartition_classes.png')
plt.show()
No description has been provided for this image

On remarque un certain déséquilibre dans les classes de fetal_health, ce qui est normal.

Statistiques descriptives de base¶

In [ ]:
data.describe()
Out[ ]:
baseline value accelerations fetal_movement uterine_contractions light_decelerations severe_decelerations prolongued_decelerations abnormal_short_term_variability mean_value_of_short_term_variability percentage_of_time_with_abnormal_long_term_variability ... histogram_width histogram_min histogram_max histogram_number_of_peaks histogram_number_of_zeroes histogram_mode histogram_mean histogram_median histogram_variance histogram_tendency
count 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.00000 ... 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000 2126.000000
mean 133.303857 0.003178 0.009481 0.004366 0.001889 0.000003 0.000159 46.990122 1.332785 9.84666 ... 70.445908 93.579492 164.025400 4.068203 0.323612 137.452023 134.610536 138.090310 18.808090 0.320320
std 9.840844 0.003866 0.046666 0.002946 0.002960 0.000057 0.000590 17.192814 0.883241 18.39688 ... 38.955693 29.560212 17.944183 2.949386 0.706059 16.381289 15.593596 14.466589 28.977636 0.610829
min 106.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 12.000000 0.200000 0.00000 ... 3.000000 50.000000 122.000000 0.000000 0.000000 60.000000 73.000000 77.000000 0.000000 -1.000000
25% 126.000000 0.000000 0.000000 0.002000 0.000000 0.000000 0.000000 32.000000 0.700000 0.00000 ... 37.000000 67.000000 152.000000 2.000000 0.000000 129.000000 125.000000 129.000000 2.000000 0.000000
50% 133.000000 0.002000 0.000000 0.004000 0.000000 0.000000 0.000000 49.000000 1.200000 0.00000 ... 67.500000 93.000000 162.000000 3.000000 0.000000 139.000000 136.000000 139.000000 7.000000 0.000000
75% 140.000000 0.006000 0.003000 0.007000 0.003000 0.000000 0.000000 61.000000 1.700000 11.00000 ... 100.000000 120.000000 174.000000 6.000000 0.000000 148.000000 145.000000 148.000000 24.000000 1.000000
max 160.000000 0.019000 0.481000 0.015000 0.015000 0.001000 0.005000 87.000000 7.000000 91.00000 ... 180.000000 159.000000 238.000000 18.000000 10.000000 187.000000 182.000000 186.000000 269.000000 1.000000

8 rows × 21 columns

Histogrammes de chaque variable avec decoupage en fonction de la classe¶

In [ ]:
sns.set_context("notebook")
In [ ]:
sns.set_palette('viridis')
In [ ]:
for column in data.columns:
    if column != "fetal_health":
        sns.histplot(x=column, hue="fetal_health", multiple="stack", palette="Set1", data=data)
        plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Boxplot pour chaque variable¶

In [ ]:
# Define the number of columns per row
columns_per_row = 5

# Calculate the number of rows
num_rows = (len(data.columns) + columns_per_row - 1) // columns_per_row

# Create subplots
fig, axes = plt.subplots(nrows=num_rows, ncols=columns_per_row, figsize=(20, 5*num_rows))

# Flatten axes if only one row
if num_rows == 1:
    axes = axes.reshape(1, -1)

# Plot boxplots
for i, column in enumerate(data.columns):
    row = i // columns_per_row
    col = i % columns_per_row
    sns.boxplot(y=column, data=data, ax=axes[row, col])
    axes[row, col].set_title(column)

# Hide empty subplots
for i in range(len(data.columns), num_rows * columns_per_row):
    row = i // columns_per_row
    col = i % columns_per_row
    fig.delaxes(axes[row, col])

plt.tight_layout()
plt.show()
No description has been provided for this image

Boxplot pour chaque variable, avec distinction de classe¶

In [ ]:
# Define the number of columns per row
columns_per_row = 4

# Calculate the number of rows
num_rows = (len(data.columns) + columns_per_row - 1) // columns_per_row

# Create subplots
fig, axes = plt.subplots(nrows=num_rows, ncols=columns_per_row, figsize=(20, 5*num_rows))

# Flatten axes if only one row
if num_rows == 1:
    axes = axes.reshape(1, -1)

# Plot boxplots
for i, column in enumerate(data.columns):
    row = i // columns_per_row
    col = i % columns_per_row
    sns.boxplot(y=column, data=data, ax=axes[row, col], x=data.fetal_health)
    axes[row, col].set_title(column)

# Hide empty subplots
for i in range(len(data.columns), num_rows * columns_per_row):
    row = i // columns_per_row
    col = i % columns_per_row
    fig.delaxes(axes[row, col])

plt.tight_layout()
plt.show()
No description has been provided for this image

Calcul de corrélations¶

Matrice de corrélation¶

In [ ]:
corr = data.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(13, 11))
plt.grid(False)
sns.heatmap(corr, mask=mask, cmap='coolwarm', linewidths=0.5, annot=True, annot_kws={"fontsize":9}, fmt='.2f')
plt.show()
No description has been provided for this image
In [ ]:
correlation_data = data_with_fetal_health.corr()['fetal_health']

# Créez un DataFrame à partir de la série de corrélation
correlation_df = pd.DataFrame(correlation_data)
correlation_df = correlation_df.drop('fetal_health')

# Tracez la heatmap de la corrélation
plt.figure(figsize=(13, 11))
plt.grid(False)
sns.heatmap(correlation_df, cmap='coolwarm', linewidths=0.5, annot=True, )
plt.title('Corrélation entre chaque variable et fetal_health')
plt.show()
No description has been provided for this image

On remarque que les variables "accelerations","prolongued_decelerations", "abnormal_short_term_variability", "percentage_of_time_with_abnormal_long_term_variability" et "mean_value_of_long_term_variability" sont celles qui sont le plus corrélé à la santé du foetus

In [ ]:
sns.scatterplot(data =data,x="accelerations",y="fetal_movement", hue="fetal_health")
plt.show()
sns.scatterplot(data =data,x="prolongued_decelerations",y="fetal_movement", hue="fetal_health")
plt.show()
No description has been provided for this image
No description has been provided for this image

Analyse en Composantes principales (ACP)¶

Pourcentages d'inertie expliquée¶

In [ ]:
data_without_class = data.drop('fetal_health', axis=1)
cls = PCA(n_components=21)
pcs = cls.fit_transform(data_without_class)
print(f"Part d'inertie expliquée par chaque composante:\n{[round(value, 3) for value in cls.explained_variance_ratio_]}")

plt.bar([f"{i}" for i in range(1, 22)], cls.explained_variance_ratio_*100)
plt.xlabel("N° d'axe factoriel")
plt.ylabel("% d'inertie expliqué")
plt.show()
Part d'inertie expliquée par chaque composante:
[0.589, 0.161, 0.097, 0.07, 0.037, 0.028, 0.006, 0.005, 0.004, 0.001, 0.001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
No description has been provided for this image

On remarque qu'à partir du 3° axe factoriel, le pourcentage d'inertie expliquée est faible voir très faible. Les axes factoriels 1 et 2 expliquent 74% de l'inertie.

Représentation dans le premier plan factoriel¶

In [ ]:
fig = px.scatter(x=pcs.T[0], y=pcs.T[1], color=data['fetal_health'], 
                 labels={'x': 'PC1', 'y': 'PC2'}, width=650)
fig.show()
In [ ]:
plt.figure(figsize=(6, 4))
sns.scatterplot(x=pcs.T[0], y=pcs.T[1], hue=data['fetal_health'],marker="$\circ$", ec="face")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
No description has been provided for this image
In [ ]:
fig = px.scatter(x=pcs.T[1], y=pcs.T[2], color=data['fetal_health'], 
                 labels={'x': 'PC2', 'y': 'PC3'}, width=650, )
fig.show()
In [ ]:
import plotly.graph_objs as go

# Créer une trace de nuage de points en 3D
scatter = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=2,
        color=data['fetal_health'],  # Couleur basée sur la variable 'fetal_health'
        colorscale='Viridis',  # Choisissez une colormap appropriée
        opacity=0.9
    )
)

# Créer une figure
fig = go.Figure(data=[scatter])

# Mise en forme du layout
fig.update_layout(
    scene=dict(
        xaxis=dict(title='x'),
        yaxis=dict(title='y'),
        zaxis=dict(title='z')
    )
)

# Afficher la figure
fig.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 5
      1 import plotly.graph_objs as go
      3 # Créer une trace de nuage de points en 3D
      4 scatter = go.Scatter3d(
----> 5     x=x,
      6     y=y,
      7     z=z,
      8     mode='markers',
      9     marker=dict(
     10         size=2,
     11         color=data['fetal_health'],  # Couleur basée sur la variable 'fetal_health'
     12         colorscale='Viridis',  # Choisissez une colormap appropriée
     13         opacity=0.9
     14     )
     15 )
     17 # Créer une figure
     18 fig = go.Figure(data=[scatter])

NameError: name 'x' is not defined

On remarque, dans une certaine mesure que les observations de même classe sont rassemblées. Mais les clusters sont souvent confondus donc, en l'état (avec seulement une représentation dans le plan), on aura du mal à prédire la classe avec un algorithme de clustering.

ACP normalisée et pondérée¶

In [ ]:
from sklearn.preprocessing import StandardScaler

data_without_class_copy = data_without_class.copy()
effectifs = data['fetal_health'].value_counts()
scaler = StandardScaler()
# X = data_without_class_copy.to_numpy()
X = scaler.fit_transform(data_without_class_copy)
D = np.empty(len(data), dtype=float)

for i, row in enumerate(data.itertuples(index=False)):
    if row.fetal_health == 1.0:
        D[i] = 1/(3*effectifs[1.0])
    elif row.fetal_health == 2.0:
        D[i] = 1/(3*effectifs[2.0])
    elif row.fetal_health == 3.0:
        D[i] = 1/(3*effectifs[3.0])

D = np.diag(D)
V = np.matmul(X.T, np.matmul(D, X))
L, U = np.linalg.eig(V)

sorted_indices = np.argsort(L)[::-1]
L = L[sorted_indices]
U = U[:, sorted_indices]

plt.bar([f"{i}" for i in range(1, 22)], L/L.sum()*100)
plt.xlabel("N° d'axe factoriel")
plt.ylabel("% d'inertie expliqué")
#plt.savefig("%inertie_axe.png")
plt.show()

C = np.matmul(X, U)
No description has been provided for this image
In [ ]:
plt.figure(figsize=(6, 4))
sns.scatterplot(x=C.T[0], y=C.T[1], hue=data['fetal_health'],marker="$\circ$", ec="face")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
No description has been provided for this image

Méthodes de clustering¶

Algorithmes des K-Means¶

In [ ]:
def kmeans_generator(dataset, n_clusters_list):
    for n_clusters in n_clusters_list:
        cls = KMeans(n_clusters=n_clusters, init="k-means++",)
        km = cls.fit(dataset)
        inertia = km.inertia_
        yield n_clusters, inertia
In [ ]:
gen = kmeans_generator(data_without_class, [i for i in range(2,21)],)

elbow = pd.DataFrame(gen, columns=["n_clusters", "inertia"])
elbow = elbow.astype({"n_clusters": "int32"})
sns.lineplot(elbow, x="n_clusters", y="inertia")
plt.show()